This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(Metrics)
library(readr)
library(ggplot2)#for visualisation
library(corrplot)#for visualisation of correlation
## corrplot 0.92 loaded
library(mlbench)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)#converting ggplot to plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
library(lattice)
library(caret)
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(caTools) # for splitting data into testing and training sets
library(dplyr) # for manipulating data frames
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlbench)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Amsterdam housing prices dataset (924 rows, 8 columns).
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a project-root helper so the document knits on other machines.
data <- read_csv("C:/Users/FD_gi/Documents/Regression lineal/data/HousingPrices-Amsterdam-August-2021.csv")
## New names:
## Rows: 924 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Address, Zip dbl (6): ...1, Price, Area, Room, Lon, Lat
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Visualise missingness: Amelia's missingness map, missing cells in yellow,
# observed in black (the warnings below come from Amelia, not from the data).
missmap(data,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
# Checking for NA and missing values and removing them.
# sum() over the logical result of is.na() counts missing cells directly —
# clearer and cheaper than length(which(is.na(data) == T)), and avoids the
# reassignable shorthand `T`.
numberOfNA <- sum(is.na(data))
numberOfNA
## [1] 4
# Remove the rows that contain NA values (920 of 924 rows remain, see str() below).
data <- data %>%
  drop_na()
# Inspect the cleaned tibble: 920 rows remain after dropping NAs (down from 924).
str(data)
## tibble [920 × 8] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:920] 1 2 3 4 5 6 7 8 9 10 ...
## $ Address: chr [1:920] "Blasiusstraat 8 2, Amsterdam" "Kromme Leimuidenstraat 13 H, Amsterdam" "Zaaiersweg 11 A, Amsterdam" "Tenerifestraat 40, Amsterdam" ...
## $ Zip : chr [1:920] "1091 CR" "1059 EL" "1097 SM" "1060 TH" ...
## $ Price : num [1:920] 685000 475000 850000 580000 720000 450000 450000 590000 399000 300000 ...
## $ Area : num [1:920] 64 60 109 128 138 53 87 80 49 33 ...
## $ Room : num [1:920] 3 3 4 6 5 2 3 2 3 2 ...
## $ Lon : num [1:920] 4.91 4.85 4.94 4.79 4.9 ...
## $ Lat : num [1:920] 52.4 52.3 52.3 52.3 52.4 ...
dim(data)
## [1] 920 8
# Removing row numbers: `...1` is the unnamed index column auto-named by read_csv.
data$...1 <- NULL
dim(data)
## [1] 920 7
# Remove Zip and Address: free-text identifiers, not usable as numeric features.
data$Zip <- NULL
data$Address <- NULL
dim(data)
## [1] 920 5
library(corrplot) # already attached above; re-attaching is harmless
str(data)
## tibble [920 × 5] (S3: tbl_df/tbl/data.frame)
## $ Price: num [1:920] 685000 475000 850000 580000 720000 450000 450000 590000 399000 300000 ...
## $ Area : num [1:920] 64 60 109 128 138 53 87 80 49 33 ...
## $ Room : num [1:920] 3 3 4 6 5 2 3 2 3 2 ...
## $ Lon : num [1:920] 4.91 4.85 4.94 4.79 4.9 ...
## $ Lat : num [1:920] 52.4 52.3 52.3 52.3 52.4 ...
# Correlation matrix as circles, then with numeric coefficients.
corrplot(cor(data))
corrplot(cor(data),method='number')
# Highly correlated variables: caret::findCorrelation suggests columns to drop
# when a pairwise |correlation| exceeds the cutoff.
correlated <- cor(data)
highCorr <- findCorrelation(correlated, cutoff=0.70)
highCorr
## [1] 2
# Column 2 (Area) is flagged — presumably its correlation with Room and/or
# Price; confirm against the corrplot above. It is kept in the models below.
names(data[highCorr])
## [1] "Area"
# Five-number summaries of the remaining numeric features; note the heavy
# right skew of Price (max ~ 10x the 3rd quartile).
summary(data)
## Price Area Room Lon
## Min. : 175000 Min. : 21.00 Min. : 1.000 Min. :4.645
## 1st Qu.: 350000 1st Qu.: 60.00 1st Qu.: 3.000 1st Qu.:4.856
## Median : 467000 Median : 83.00 Median : 3.000 Median :4.887
## Mean : 622065 Mean : 95.61 Mean : 3.564 Mean :4.889
## 3rd Qu.: 700000 3rd Qu.:113.00 3rd Qu.: 4.000 3rd Qu.:4.922
## Max. :5950000 Max. :623.00 Max. :14.000 Max. :5.029
## Lat
## Min. :52.29
## 1st Qu.:52.35
## Median :52.36
## Mean :52.36
## 3rd Qu.:52.38
## Max. :52.42
# Preview the cleaned modeling tibble.
data
## # A tibble: 920 × 5
## Price Area Room Lon Lat
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 685000 64 3 4.91 52.4
## 2 475000 60 3 4.85 52.3
## 3 850000 109 4 4.94 52.3
## 4 580000 128 6 4.79 52.3
## 5 720000 138 5 4.90 52.4
## 6 450000 53 2 4.88 52.4
## 7 450000 87 3 4.90 52.4
## 8 590000 80 2 4.87 52.4
## 9 399000 49 3 4.85 52.4
## 10 300000 33 2 4.90 52.4
## # ℹ 910 more rows
# Split the loaded dataset into train and test sets: 80% of the rows go to
# training and 20% to testing (the original comment said 75%, which did not
# match the prob = c(0.8, 0.2) used below).
# Note: sample() with prob gives an *approximate* 80/20 split (737/183 here).
set.seed(123)
ind <- sample(2, nrow(data), prob = c(0.8, 0.2), replace = TRUE)
train <- data[ind == 1, ]
test <- data[ind == 2, ]
dim(data)
## [1] 920 5
dim(train)
## [1] 737 5
dim(test)
## [1] 183 5
# Baseline multiple linear regression: Price on all remaining features
# (Area, Room, Lon, Lat).
lm_model <- lm(Price ~ .,
data = train)
lm_model
##
## Call:
## lm(formula = Price ~ ., data = train)
##
## Coefficients:
## (Intercept) Area Room Lon Lat
## -39179798 9095 -61030 -444657 789084
# All four predictors are significant at the 5% level; R^2 ≈ 0.75 on train.
summary(lm_model)
##
## Call:
## lm(formula = Price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1665296 -133835 9973 112256 2368954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.918e+07 2.111e+07 -1.856 0.0638 .
## Area 9.095e+03 2.929e+02 31.049 < 2e-16 ***
## Room -6.103e+04 1.053e+04 -5.795 1.02e-08 ***
## Lon -4.447e+05 1.840e+05 -2.417 0.0159 *
## Lat 7.891e+05 3.992e+05 1.977 0.0484 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 260000 on 732 degrees of freedom
## Multiple R-squared: 0.7456, Adjusted R-squared: 0.7443
## F-statistic: 536.5 on 4 and 732 DF, p-value: < 2.2e-16
# Predict on the held-out test set and score with caret::postResample
# (RMSE, R-squared, MAE).
pLm <- predict(lm_model,test)
postResample(pLm,test$Price)
## RMSE Rsquared MAE
## 3.961636e+05 6.054453e-01 2.017786e+05
# Cross validation: 10-fold CV estimate of the linear model's out-of-sample
# error via caret::train. (Removed the unused `x <- data.matrix(train)` and
# `y <- train$Price` assignments — they were never referenced anywhere.)
control <- trainControl(method = "cv",
number = 10)
lineerCV <- train(Price~.,
data = train,
method = "lm",
trControl = control )
lineerCV
## Linear Regression
##
## 737 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 664, 663, 663, 664, 663, 664, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 259936.1 0.7009155 172590.4
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# The final model is refit on all of `train`, so its coefficients match
# lm_model above exactly.
summary(lineerCV)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1665296 -133835 9973 112256 2368954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.918e+07 2.111e+07 -1.856 0.0638 .
## Area 9.095e+03 2.929e+02 31.049 < 2e-16 ***
## Room -6.103e+04 1.053e+04 -5.795 1.02e-08 ***
## Lon -4.447e+05 1.840e+05 -2.417 0.0159 *
## Lat 7.891e+05 3.992e+05 1.977 0.0484 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 260000 on 732 degrees of freedom
## Multiple R-squared: 0.7456, Adjusted R-squared: 0.7443
## F-statistic: 536.5 on 4 and 732 DF, p-value: < 2.2e-16
# Predict: identical test metrics to lm_model, as expected for the same fit.
pLmCV <- predict(lineerCV,test)
postResample(pLmCV,test$Price)
## RMSE Rsquared MAE
## 3.961636e+05 6.054453e-01 2.017786e+05
# Actual vs predicted Price for the simple linear model, with a loess trend,
# rendered interactively via plotly.
# NOTE(review): `pLm` is not a column of `test` — ggplot finds it in the
# calling environment; it aligns row-for-row with `test` only because it was
# predicted from `test` above.
plLinearSimple <-test %>%
ggplot(aes(Price,pLm)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of Price') +
ylab('Predicted value of Price')+
theme_bw()
ggplotly(plLinearSimple)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Ridge regression via glmnet (alpha = 0), tuning lambda over a 50-point grid
# with the same 10-fold CV control as above.
# NOTE(review): the lambda grid tops out at 1, which is tiny relative to
# Price's scale (~1e5-1e6) — the penalty is likely negligible here; consider a
# log-spaced grid over a much wider range.
ridge <- train(Price~.,
data = train,
method = "glmnet",
tuneGrid = expand.grid(alpha = 0,
lambda = seq(0.0001,1,length=50)),
trControl = control )
pRidge <- predict(ridge,test)
postResample(pRidge,test$Price)
## RMSE Rsquared MAE
## 4.008932e+05 6.135220e-01 1.959027e+05
# Actual vs predicted Price for the ridge model (pRidge is an environment
# variable aligned with `test`, as with pLm above).
plRidge <-test %>%
ggplot(aes(Price,pRidge)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of Price') +
ylab('Predicted value of Price')+
theme_bw()
ggplotly(plRidge)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Lasso regression via glmnet (alpha = 1), same lambda grid and CV control as
# the ridge fit. The same caveat applies: lambda <= 1 is likely too small to
# shrink coefficients on this Price scale.
lasso <- train(Price~.,
data = train,
method = "glmnet",
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0.0001,1,length=50)),
trControl = control )
pLasso <- predict(lasso,test)
postResample(pLasso,test$Price)
## RMSE Rsquared MAE
## 3.959416e+05 6.064406e-01 2.013277e+05
# Actual vs predicted Price for the lasso model, with a loess trend,
# rendered interactively via plotly.
plot_lasso <- ggplot(test, aes(Price, pLasso)) +
  geom_point(alpha = 0.5) +
  stat_smooth(aes(colour = 'black')) +
  xlab('Actual value of Price') +
  ylab('Predicted value of Price') +
  theme_bw()
ggplotly(plot_lasso)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
library(catboost) # note: catboost's R package is installed from GitHub releases, not CRAN
# Separate x and y for the train and test datasets — the shape the catboost
# package expects for building pools.
library(dplyr)
y_train <- unlist(train[c('Price')])
X_train <- train %>% select(-Price)
y_valid <- unlist(test[c('Price')])
X_valid <- test %>% select(-Price)
# Convert the train and test datasets to catboost's pool format using
# catboost.load_pool, supplying both features and labels.
train_pool <- catboost.load_pool(data = X_train, label = y_train)
test_pool <- catboost.load_pool(data = X_valid, label = y_valid)
# Create the input params for the CatBoost regression: 500 trees of depth 10,
# RMSE objective, iteration-based overfitting detector with 20-round patience.
params <- list(iterations=500,
learning_rate=0.01,
depth=10,
loss_function='RMSE',
eval_metric='RMSE',
random_seed = 55,
od_type='Iter',
metric_period = 50,
od_wait=20,
use_best_model=TRUE)
# Train CatBoost. Passing test_pool as the evaluation set lets
# use_best_model=TRUE and the od_type='Iter' early-stopping detector actually
# take effect — the original call omitted it, so catboost warned
# ("You should provide test set for use best model") and silently switched
# use_best_model off.
modelCatboost <- catboost.train(learn_pool = train_pool,
                                test_pool = test_pool,
                                params = params)
## (Learn progress below is from the original run, which trained without an
## evaluation set; numbers will differ slightly on re-knit.)
## 0: learn: 511270.0935606 total: 151ms remaining: 1m 15s
## 50: learn: 391212.1848769 total: 678ms remaining: 5.97s
## 100: learn: 310950.4012498 total: 1.24s remaining: 4.91s
## 150: learn: 256155.2699471 total: 1.73s remaining: 4s
## 200: learn: 217953.7878002 total: 2.27s remaining: 3.38s
## 250: learn: 191625.5903299 total: 2.78s remaining: 2.75s
## 300: learn: 173229.6150395 total: 3.22s remaining: 2.13s
## 350: learn: 159696.8924010 total: 3.74s remaining: 1.59s
## 400: learn: 148480.9119280 total: 4.26s remaining: 1.05s
## 450: learn: 138989.5437633 total: 4.75s remaining: 516ms
## 499: learn: 131528.7954719 total: 5.26s remaining: 0us
# Predict on the test pool and score: CatBoost beats all linear variants here.
y_pred=catboost.predict(modelCatboost,test_pool)
catboostMetrics <- postResample(y_pred,test$Price)
catboostMetrics
## RMSE Rsquared MAE
## 3.239090e+05 7.841589e-01 1.144889e+05
# Actual vs predicted Price for the CatBoost model (y_pred is an environment
# variable aligned row-for-row with `test`, as with the other prediction plots).
plCatboost <-test %>%
ggplot(aes(Price,y_pred)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of Price') +
ylab('Predicted value of Price')+
theme_bw()
ggplotly(plCatboost)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'